knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE
)
# set working directory
setwd("C:/Users/vo_j4/Desktop/github data")
# load datasets
IGM_data <-read.csv("GDIM_2023_03.csv")
gini_data <- read.csv("gini 1991-2017 feb 22, 2024.csv")
# load packages
library(leaps)
library(dplyr)
library(ggplot2)
library(ggforce)
library(stats)
library(ggalt)
library(plot3D)
library(plotly)
library(gridExtra)
library(cluster)
library(factoextra)
library(caret)
head(gini_data)
# Rename the variables
colnames(gini_data)[colnames(gini_data) == "X1991..YR1991."] <- "1991"
colnames(gini_data)[colnames(gini_data) == "X1992..YR1992."] <- "1992"
colnames(gini_data)[colnames(gini_data) == "X1993..YR1993."] <- "1993"
colnames(gini_data)[colnames(gini_data) == "X1994..YR1994."] <- "1994"
colnames(gini_data)[colnames(gini_data) == "X1995..YR1995."] <- "1995"
colnames(gini_data)[colnames(gini_data) == "X1996..YR1996."] <- "1996"
colnames(gini_data)[colnames(gini_data) == "X1997..YR1997."] <- "1997"
colnames(gini_data)[colnames(gini_data) == "X1998..YR1998."] <- "1998"
colnames(gini_data)[colnames(gini_data) == "X1999..YR1999."] <- "1999"
colnames(gini_data)[colnames(gini_data) == "X2000..YR2000."] <- "2000"
colnames(gini_data)[colnames(gini_data) == "X2001..YR2001."] <- "2001"
colnames(gini_data)[colnames(gini_data) == "X2002..YR2002."] <- "2002"
colnames(gini_data)[colnames(gini_data) == "X2003..YR2003."] <- "2003"
colnames(gini_data)[colnames(gini_data) == "X2004..YR2004."] <- "2004"
colnames(gini_data)[colnames(gini_data) == "X2005..YR2005."] <- "2005"
colnames(gini_data)[colnames(gini_data) == "X2006..YR2006."] <- "2006"
colnames(gini_data)[colnames(gini_data) == "X2007..YR2007."] <- "2007"
colnames(gini_data)[colnames(gini_data) == "X2008..YR2008."] <- "2008"
colnames(gini_data)[colnames(gini_data) == "X2009..YR2009."] <- "2009"
colnames(gini_data)[colnames(gini_data) == "X2010..YR2010."] <- "2010"
colnames(gini_data)[colnames(gini_data) == "X2011..YR2011."] <- "2011"
colnames(gini_data)[colnames(gini_data) == "X2012..YR2012."] <- "2012"
colnames(gini_data)[colnames(gini_data) == "X2013..YR2013."] <- "2013"
colnames(gini_data)[colnames(gini_data) == "X2014..YR2014."] <- "2014"
colnames(gini_data)[colnames(gini_data) == "X2015..YR2015."] <- "2015"
colnames(gini_data)[colnames(gini_data) == "X2016..YR2016."] <- "2016"
colnames(gini_data)[colnames(gini_data) == "X2017..YR2017."] <- "2017"
# empty NAs cells instead of ",,"
columns_to_process <- colnames(gini_data)[5:ncol(gini_data)] ## specify the columns to be processed (from "1991" to "2017")
for (col in columns_to_process) { ## loop through each specified column and replace ".." with ""
gini_data[gini_data[[col]] == "..", col] <- ""
gini_data[[col]] <- as.numeric(gini_data[[col]])
}
library(dplyr)
# group by country and calculate the mean for numerical variables in columns 16 to 90
numerical <- IGM_data %>%
group_by(country) %>%
summarize(across(15:89, ~mean(., na.rm = TRUE)))
# transfer over categorical variables in columns 1 to 12 because they're all identical
categorical <- IGM_data %>%
group_by(country) %>%
summarize(across(0:11, ~first(.)))
# merge 2 datasets into df
df <- merge( categorical,numerical, by = "country")
# add a new variable "gini" to df
df$gini <- NA
for (i in 1:nrow(df)) {
# extract year, country, code columns
country_code <- df$code[i]
temp_year <- as.character(df$year[i])
# find corresponding column in gini_data
col_name <- temp_year # Assuming column names are just the years
# subset gini_data based on country code
matching_rows <- gini_data$Country.Code == country_code
# check if there are any matching rows
if (any(matching_rows)) {
# match the Country.Code and assign the value to "gini"
df$gini[i] <- gini_data[matching_rows, col_name]
} else {
# if no matching rows, assign NA
df$gini[i] <- "none"
}
}
df$gini
## [1] NA "33.7" "42.7" "41.1" "32.5" NA "30.8" NA "32.1" "25.3"
## [11] "27.6" "43.4" "40.9" "50.8" NA "53.3" "52" "36" "39.8" "38.6"
## [21] "47.2" NA "42.8" "33.2" "56.2" "43.3" "45.8" "42.2" "52.6" "45.3"
## [31] "42.1" "48.9" "48.3" "43.2" "30.9" "34.3" "25.4" "28.4" "41.6" "42.2"
## [41] "46.9" "28.3" "38" "31.2" "51.5" NA "40.4" "27.1" "31.9" "38"
## [51] "35.9" "36.6" "31.4" "42.4" "35" "48.3" "43" "50.7" "41.1" "49.4"
## [61] "30.3" "27.2" "35.7" "40.2" "38.8" "29.5" "32.8" "39" "35.2" NA
## [71] "33.7" "27.2" NA "37" "31.2" "26.7" "26.8" "36" "34.3" "31.8"
## [81] "44.9" "33.2" "38.4" "42.6" NA "41.1" "38.4" NA "37.7" "38.5"
## [91] NA "26.3" "32.3" "38.5" "40.7" "45.6" "38.1" "59.1" NA "28.2"
## [101] NA NA "34.3" "35.5" "34.5" "28.5" NA "52.7" "41.9" "48.5"
## [111] "43.1" NA "31.2" "35.2" "34.4" "36.8" "48.5" "30.8" "40.3" "38.8"
## [121] "34" "26.1" "24.8" NA "63" "46.3" "35.8" "38.7" "35.4" "29.6"
## [131] "33" "none" NA NA "39.3" "27.8" "43.1" "37.5" NA "41.9"
## [141] "39.1" NA "24.7" "33.1" "41.2" "39.5" NA "37.4" NA "35.6"
## [151] "34.4" "36.7" "52"
# quick look at gini
summary(df$gini)
## Length Class Mode
## 153 character character
# see which countries had NA value and which year that was
na_countries <- data.frame(cbind(df[is.na(df$gini), c("country", "year")]))
na_countries
# after manually select the appropriate value for each of those NAs,add them to our dataset
# reference Methods section and README file for further details
NA_countries <- c("Afghanistan", "Australia", "Azerbaijan", "Bosnia and Herzegovina", "Cambodia", "Ethiopia", "Japan", "Kenya", "Malawi", "Mali", "Mexico", "Nepal", "New Zealand", "Nicaragua", "Pakistan", "Philippines", "Solomon Islands", "Taiwan, China", "Tajikistan", "Tanzania", "Tunisia", "Uganda", "Uzbekistan", "Venezuela, RB")
NA_gini <- c(31.6, 32.3, 26.6, 33, 30.76, 35, 32.9, 40.8, 45.5, 33, 47.2, 32.8, 36.2, 46.2, 28.7, 47.7, 37.1,33.8, 34, 37.8, 32.8, 41, 31.2, 39)
# match country names with corresponding gini values
for (i in 1:length(NA_countries)) {
df$gini[df$country == NA_countries[i]] <- NA_gini[i]
}
df$gini <- as.numeric(df$gini)
# checking correlation between all BH variables
cor(df[47:58])
## BHQ1_randomtiebreak BHQ1_lb BHQ1_ub
## BHQ1_randomtiebreak 1.00000000 0.8454426 0.07490105
## BHQ1_lb 0.84544256 1.0000000 -0.40357163
## BHQ1_ub 0.07490105 -0.4035716 1.00000000
## BHQ2_randomtiebreak 0.29824233 0.2037934 0.08968521
## BHQ2_lb 0.51270699 0.8590112 -0.73658963
## BHQ2_ub -0.50421392 -0.8366010 0.75558397
## BHQ3_randomtiebreak -0.84575565 -0.7028628 -0.09771206
## BHQ3_lb 0.42063623 0.7531698 -0.74055337
## BHQ3_ub -0.77857303 -0.9401619 0.46896606
## BHQ4_randomtiebreak -0.81292607 -0.6657970 -0.08069578
## BHQ4_lb 0.20224092 0.4974151 -0.67837860
## BHQ4_ub -0.87037047 -0.8079332 0.05811190
## BHQ2_randomtiebreak BHQ2_lb BHQ2_ub
## BHQ1_randomtiebreak 0.29824233 0.5127070 -0.5042139
## BHQ1_lb 0.20379338 0.8590112 -0.8366010
## BHQ1_ub 0.08968521 -0.7365896 0.7555840
## BHQ2_randomtiebreak 1.00000000 0.2309542 0.1253588
## BHQ2_lb 0.23095416 1.0000000 -0.8972863
## BHQ2_ub 0.12535879 -0.8972863 1.0000000
## BHQ3_randomtiebreak -0.36135509 -0.4253365 0.3612155
## BHQ3_lb -0.09426237 0.8825828 -0.9550175
## BHQ3_ub -0.17269690 -0.8761475 0.8823016
## BHQ4_randomtiebreak -0.71718143 -0.4677107 0.2849251
## BHQ4_lb -0.36429043 0.6282982 -0.8185536
## BHQ4_ub -0.59400394 -0.6242911 0.4790428
## BHQ3_randomtiebreak BHQ3_lb BHQ3_ub
## BHQ1_randomtiebreak -0.84575565 0.42063623 -0.7785730
## BHQ1_lb -0.70286277 0.75316985 -0.9401619
## BHQ1_ub -0.09771206 -0.74055337 0.4689661
## BHQ2_randomtiebreak -0.36135509 -0.09426237 -0.1726969
## BHQ2_lb -0.42533651 0.88258275 -0.8761475
## BHQ2_ub 0.36121554 -0.95501754 0.8823016
## BHQ3_randomtiebreak 1.00000000 -0.21675476 0.6941948
## BHQ3_lb -0.21675476 1.00000000 -0.8297685
## BHQ3_ub 0.69419481 -0.82976847 1.0000000
## BHQ4_randomtiebreak 0.58650480 -0.29295825 0.5770133
## BHQ4_lb -0.23982120 0.78868297 -0.6987643
## BHQ4_ub 0.64124433 -0.48468645 0.7410092
## BHQ4_randomtiebreak BHQ4_lb BHQ4_ub
## BHQ1_randomtiebreak -0.81292607 0.20224092 -0.87037047
## BHQ1_lb -0.66579698 0.49741507 -0.80793317
## BHQ1_ub -0.08069578 -0.67837860 0.05811190
## BHQ2_randomtiebreak -0.71718143 -0.36429043 -0.59400394
## BHQ2_lb -0.46771074 0.62829819 -0.62429114
## BHQ2_ub 0.28492515 -0.81855355 0.47904275
## BHQ3_randomtiebreak 0.58650480 -0.23982120 0.64124433
## BHQ3_lb -0.29295825 0.78868297 -0.48468645
## BHQ3_ub 0.57701334 -0.69876430 0.74100924
## BHQ4_randomtiebreak 1.00000000 0.12638517 0.96023901
## BHQ4_lb 0.12638517 1.00000000 -0.08160403
## BHQ4_ub 0.96023901 -0.08160403 1.00000000
ggplot(as.data.frame(as.table(cor(df[47:58]))), aes(Var1, Var2, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
theme_minimal() +
labs(title = "BH Variables Correlation Heatmap", x = "", y = "")
# checking correlation between all TM variables
cor(df[63:87])
## tm11 tm12 tm13 tm14 tm15
## tm11 1.00000000 -0.192668751 -0.549123110 -0.6837711482 -0.53637556
## tm12 -0.19266875 1.000000000 -0.162739052 -0.1542671357 -0.24399566
## tm13 -0.54912311 -0.162739052 1.000000000 0.0073007649 0.14703920
## tm14 -0.68377115 -0.154267136 0.007300765 1.0000000000 0.39694724
## tm15 -0.53637556 -0.243995665 0.147039199 0.3969472444 1.00000000
## tm21 0.88900218 -0.261230000 -0.436943848 -0.5930346174 -0.46827871
## tm22 0.27495025 0.720399237 -0.380058988 -0.4265771125 -0.38468978
## tm23 -0.35566401 -0.107416032 0.877386001 -0.1456100634 -0.02082919
## tm24 -0.67779560 -0.004532989 0.007239148 0.8946297617 0.36885240
## tm25 -0.29779725 -0.207520754 -0.033273027 0.2868576820 0.73671092
## tm31 0.84060840 -0.249399414 -0.440937462 -0.5252967002 -0.45906778
## tm32 0.53505474 0.418080046 -0.434948380 -0.5436615053 -0.43715553
## tm33 -0.08510869 -0.153871526 0.680180851 -0.2847389662 -0.17673760
## tm34 -0.60825509 0.077954886 0.027299101 0.7817299071 0.21585849
## tm35 -0.14808332 0.039741878 -0.032928849 -0.0334161351 0.53632580
## tm41 0.74380942 -0.190957881 -0.412638014 -0.4766317313 -0.38530014
## tm42 0.51640368 0.195707604 -0.330470559 -0.4936522945 -0.34316675
## tm43 0.19723799 -0.134398176 0.410160685 -0.4830180660 -0.21689899
## tm44 -0.51813867 0.072524705 0.015177891 0.7126880000 0.09715910
## tm45 -0.18815775 0.034068154 0.080297336 -0.0007170447 0.40154090
## tm51 0.68271292 -0.152852996 -0.397453237 -0.4292570209 -0.36858968
## tm52 0.58496607 0.078816756 -0.429421568 -0.4240138010 -0.34852914
## tm53 0.34626730 -0.098949615 0.166250744 -0.4773380155 -0.30907783
## tm54 -0.28151009 0.069732827 -0.020110545 0.4537792325 -0.07149219
## tm55 -0.47180422 0.052139616 0.204095525 0.2819801404 0.48931456
## tm21 tm22 tm23 tm24 tm25 tm31
## tm11 0.889002181 0.27495025 -0.35566401 -0.677795596 -0.29779725 0.84060840
## tm12 -0.261230000 0.72039924 -0.10741603 -0.004532989 -0.20752075 -0.24939941
## tm13 -0.436943848 -0.38005899 0.87738600 0.007239148 -0.03327303 -0.44093746
## tm14 -0.593034617 -0.42657711 -0.14561006 0.894629762 0.28685768 -0.52529670
## tm15 -0.468278712 -0.38468978 -0.02082919 0.368852401 0.73671092 -0.45906778
## tm21 1.000000000 0.11279998 -0.33097968 -0.676304518 -0.35167165 0.87396046
## tm22 0.112799981 1.00000000 -0.27269379 -0.371649023 -0.36381178 0.11980013
## tm23 -0.330979681 -0.27269379 1.00000000 -0.180547912 -0.21724112 -0.30734039
## tm24 -0.676304518 -0.37164902 -0.18054791 1.000000000 0.24545329 -0.57209409
## tm25 -0.351671649 -0.36381178 -0.21724112 0.245453292 1.00000000 -0.33934797
## tm31 0.873960464 0.11980013 -0.30734039 -0.572094086 -0.33934797 1.00000000
## tm32 0.444573506 0.77018052 -0.31165620 -0.554585290 -0.33577669 0.39202326
## tm33 0.001492836 -0.07140452 0.75994192 -0.413312529 -0.32437743 -0.06509326
## tm34 -0.607570660 -0.29783185 -0.07694551 0.912657690 0.02464586 -0.53910874
## tm35 -0.212717581 -0.05167199 -0.15572473 -0.044798335 0.77690344 -0.33424709
## tm41 0.774520260 0.13730328 -0.26723534 -0.547575688 -0.27479792 0.87051875
## tm42 0.444337972 0.59618013 -0.24730151 -0.524597824 -0.27357423 0.45884572
## tm43 0.252595852 0.06446007 0.53518905 -0.599791436 -0.30218003 0.22126395
## tm44 -0.501308413 -0.21686862 -0.04097268 0.801598842 -0.13344450 -0.43137789
## tm45 -0.231542857 -0.12911678 -0.05865449 0.030946113 0.62844510 -0.35042026
## tm51 0.750786407 0.13556447 -0.32371901 -0.484544643 -0.25410238 0.71656721
## tm52 0.523567583 0.47380970 -0.35252887 -0.438862420 -0.26943473 0.51213840
## tm53 0.435647079 0.14828763 0.26335535 -0.590448036 -0.36161706 0.39899581
## tm54 -0.286708374 -0.07386091 -0.06894878 0.558449808 -0.24468997 -0.16962722
## tm55 -0.532071436 -0.23483382 0.12564218 0.306337574 0.56775276 -0.57185231
## tm32 tm33 tm34 tm35 tm41 tm42
## tm11 0.5350547 -0.085108688 -0.60825509 -0.14808332 0.74380942 0.51640368
## tm12 0.4180800 -0.153871526 0.07795489 0.03974188 -0.19095788 0.19570760
## tm13 -0.4349484 0.680180851 0.02729910 -0.03292885 -0.41263801 -0.33047056
## tm14 -0.5436615 -0.284738966 0.78172991 -0.03341614 -0.47663173 -0.49365229
## tm15 -0.4371555 -0.176737605 0.21585849 0.53632580 -0.38530014 -0.34316675
## tm21 0.4445735 0.001492836 -0.60757066 -0.21271758 0.77452026 0.44433797
## tm22 0.7701805 -0.071404515 -0.29783185 -0.05167199 0.13730328 0.59618013
## tm23 -0.3116562 0.759941916 -0.07694551 -0.15572473 -0.26723534 -0.24730151
## tm24 -0.5545853 -0.413312529 0.91265769 -0.04479833 -0.54757569 -0.52459782
## tm25 -0.3357767 -0.324377432 0.02464586 0.77690344 -0.27479792 -0.27357423
## tm31 0.3920233 -0.065093255 -0.53910874 -0.33424709 0.87051875 0.45884572
## tm32 1.0000000 -0.067968996 -0.52532615 -0.11755072 0.36395132 0.76322286
## tm33 -0.0679690 1.000000000 -0.39022417 -0.30611586 -0.06017394 -0.01653481
## tm34 -0.5253262 -0.390224173 1.00000000 -0.23621390 -0.50933749 -0.51672972
## tm35 -0.1175507 -0.306115862 -0.23621390 1.00000000 -0.24403067 -0.11038990
## tm41 0.3639513 -0.060173942 -0.50933749 -0.24403067 1.00000000 0.41526891
## tm42 0.7632229 -0.016534813 -0.51672972 -0.11038990 0.41526891 1.00000000
## tm43 0.1957674 0.766959630 -0.59517644 -0.21051713 0.17557327 0.20644735
## tm44 -0.4097170 -0.238785411 0.90465796 -0.41125472 -0.40996725 -0.41241333
## tm45 -0.1990964 -0.206948274 -0.11417688 0.79469708 -0.39888368 -0.33310065
## tm51 0.3378027 -0.075457900 -0.42910252 -0.18349862 0.72609825 0.43061903
## tm52 0.6607518 -0.086414418 -0.41213068 -0.18383325 0.49095592 0.68040461
## tm53 0.3483652 0.582687309 -0.58258090 -0.30332533 0.35861435 0.35678119
## tm54 -0.2112327 -0.145036207 0.65272646 -0.50172406 -0.20401781 -0.18486343
## tm55 -0.4007454 -0.195914446 0.20696996 0.65245809 -0.52192641 -0.46776174
## tm43 tm44 tm45 tm51 tm52 tm53
## tm11 0.19723799 -0.518138672 -0.1881577465 0.68271292 0.58496607 0.34626730
## tm12 -0.13439818 0.072524705 0.0340681538 -0.15285300 0.07881676 -0.09894962
## tm13 0.41016068 0.015177891 0.0802973361 -0.39745324 -0.42942157 0.16625074
## tm14 -0.48301807 0.712688000 -0.0007170447 -0.42925702 -0.42401380 -0.47733802
## tm15 -0.21689899 0.097159097 0.4015408998 -0.36858968 -0.34852914 -0.30907783
## tm21 0.25259585 -0.501308413 -0.2315428573 0.75078641 0.52356758 0.43564708
## tm22 0.06446007 -0.216868615 -0.1291167824 0.13556447 0.47380970 0.14828763
## tm23 0.53518905 -0.040972685 -0.0586544943 -0.32371901 -0.35252887 0.26335535
## tm24 -0.59979144 0.801598842 0.0309461132 -0.48454464 -0.43886242 -0.59044804
## tm25 -0.30218003 -0.133444503 0.6284451034 -0.25410238 -0.26943473 -0.36161706
## tm31 0.22126395 -0.431377891 -0.3504202565 0.71656721 0.51213840 0.39899581
## tm32 0.19576739 -0.409716997 -0.1990964110 0.33780274 0.66075180 0.34836519
## tm33 0.76695963 -0.238785411 -0.2069482741 -0.07545790 -0.08641442 0.58268731
## tm34 -0.59517644 0.904657958 -0.1141768832 -0.42910252 -0.41213068 -0.58258090
## tm35 -0.21051713 -0.411254717 0.7946970788 -0.18349862 -0.18383325 -0.30332533
## tm41 0.17557327 -0.409967254 -0.3988836849 0.72609825 0.49095592 0.35861435
## tm42 0.20644735 -0.412413329 -0.3331006464 0.43061903 0.68040461 0.35678119
## tm43 1.00000000 -0.506015574 -0.2903541659 0.05250221 0.12426093 0.74862831
## tm44 -0.50601557 1.000000000 -0.3835263055 -0.30708158 -0.29483798 -0.49727548
## tm45 -0.29035417 -0.383526305 1.0000000000 -0.28309076 -0.31803041 -0.29812653
## tm51 0.05250221 -0.307081584 -0.2830907588 1.00000000 0.49850926 0.17839715
## tm52 0.12426093 -0.294837981 -0.3180304080 0.49850926 1.00000000 0.30530846
## tm53 0.74862831 -0.497275477 -0.2981265291 0.17839715 0.30530846 1.00000000
## tm54 -0.27670540 0.746700844 -0.4652100734 -0.17363570 -0.09384303 -0.32151846
## tm55 -0.32149457 0.007023986 0.7052339317 -0.54923452 -0.62067363 -0.54777104
## tm54 tm55
## tm11 -0.28151009 -0.471804221
## tm12 0.06973283 0.052139616
## tm13 -0.02011054 0.204095525
## tm14 0.45377923 0.281980140
## tm15 -0.07149219 0.489314561
## tm21 -0.28670837 -0.532071436
## tm22 -0.07386091 -0.234833825
## tm23 -0.06894878 0.125642182
## tm24 0.55844981 0.306337574
## tm25 -0.24468997 0.567752758
## tm31 -0.16962722 -0.571852312
## tm32 -0.21123271 -0.400745438
## tm33 -0.14503621 -0.195914446
## tm34 0.65272646 0.206969960
## tm35 -0.50172406 0.652458093
## tm41 -0.20401781 -0.521926413
## tm42 -0.18486343 -0.467761742
## tm43 -0.27670540 -0.321494566
## tm44 0.74670084 0.007023986
## tm45 -0.46521007 0.705233932
## tm51 -0.17363570 -0.549234516
## tm52 -0.09384303 -0.620673632
## tm53 -0.32151846 -0.547771044
## tm54 1.00000000 -0.384719975
## tm55 -0.38471997 1.000000000
ggplot(as.data.frame(as.table(cor(df[63:87]))), aes(Var1, Var2, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
theme_minimal() +
labs(title = "TM Variables Correlation Heatmap", x = "", y = "")
After running K-means clustering on the 12 BH variables, we perform 2 levels of validations:
Level 1: Run Principle Component Analysis (PCA) on the BH variables and plot the clusters against the first 2-3 PCs. We can check whether there’s clear margins between the clusters.
Level 2: Check descriptive qualities of each cluster through variables such as gini index and pre-derived IGM measures (CAT, YOS, COR, BETA).We run ANOVA to see if there’s statistically significant difference between each cluster.
# create plot of number of clusters vs total within sum of squares
fviz_nbclust(scale(df[47:58]), kmeans, method = "wss")
# k-means with k = 4
set.seed(123)
km.res_BH <- kmeans(scale(df[47:58]), 4, nstart = 25)
km.res_BH
## K-means clustering with 4 clusters of sizes 42, 26, 63, 22
##
## Cluster means:
## BHQ1_randomtiebreak BHQ1_lb BHQ1_ub BHQ2_randomtiebreak BHQ2_lb
## 1 1.0322240 1.0692290 -0.01700147 0.5074830 0.7745319
## 2 -1.5175469 -1.5718189 0.67182677 -0.5998219 -1.3974493
## 3 -0.1542744 0.1456727 -0.67353069 -0.3760416 0.4194482
## 4 0.2646407 -0.6008047 1.16722726 0.8168956 -1.0282680
## BHQ2_ub BHQ3_randomtiebreak BHQ3_lb BHQ3_ub BHQ4_randomtiebreak
## 1 -0.5717331 -0.8511281 0.4924190 -0.8292556 -0.9697621
## 2 1.4589229 1.2061701 -1.4408292 1.7722619 1.3655536
## 3 -0.5886225 0.1550720 0.6157683 -0.3673223 0.3060670
## 4 1.0529095 -0.2446627 -1.0006109 0.5405104 -0.6389365
## BHQ4_lb BHQ4_ub
## 1 -0.01393797 -1.0345410
## 2 -1.15024817 1.5705506
## 3 0.81527670 0.1520359
## 4 -0.94866295 -0.3164480
##
## Clustering vector:
## [1] 4 1 1 1 1 3 1 3 4 4 1 2 2 4 3 1 4 1 2 3 3 1 1 3 1 2 1 3 1 2 3 1 3 4 1 3 3
## [38] 3 2 3 1 2 4 3 1 2 3 3 3 2 2 1 3 4 1 4 2 4 4 4 1 3 4 4 3 2 1 1 1 3 3 3 4 3
## [75] 3 3 3 2 3 1 3 2 3 1 2 1 2 2 3 3 1 3 3 1 2 1 1 1 2 3 3 4 2 3 1 3 4 1 3 1 1
## [112] 3 3 4 1 3 2 3 4 1 4 3 3 3 1 2 1 3 4 3 3 3 3 3 3 2 2 3 2 3 3 2 3 4 1 1 3 3
## [149] 3 1 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 130.6324 117.0119 300.9756 88.1987
## (between_SS / total_SS = 65.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# append cluster assignment to df
df <- cbind(df, BH_cluster = factor(km.res_BH$cluster))
df$BH_cluster
## [1] 4 1 1 1 1 3 1 3 4 4 1 2 2 4 3 1 4 1 2 3 3 1 1 3 1 2 1 3 1 2 3 1 3 4 1 3 3
## [38] 3 2 3 1 2 4 3 1 2 3 3 3 2 2 1 3 4 1 4 2 4 4 4 1 3 4 4 3 2 1 1 1 3 3 3 4 3
## [75] 3 3 3 2 3 1 3 2 3 1 2 1 2 2 3 3 1 3 3 1 2 1 1 1 2 3 3 4 2 3 1 3 4 1 3 1 1
## [112] 3 3 4 1 3 2 3 4 1 4 3 3 3 1 2 1 3 4 3 3 3 3 3 3 2 2 3 2 3 3 2 3 4 1 1 3 3
## [149] 3 1 3 3 3
## Levels: 1 2 3 4
# run PCA and take 2-3 1st PCs to visualize clusters
pca_BH <- prcomp(scale(df[47:58]))
summary(pca_BH)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.6575 1.7654 1.01322 0.67539 0.4450 0.27786 0.1624
## Proportion of Variance 0.5885 0.2597 0.08555 0.03801 0.0165 0.00643 0.0022
## Cumulative Proportion 0.5885 0.8482 0.93379 0.97180 0.9883 0.99473 0.9969
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.13209 0.11705 0.06502 0.03819 5.114e-08
## Proportion of Variance 0.00145 0.00114 0.00035 0.00012 0.000e+00
## Cumulative Proportion 0.99838 0.99953 0.99988 1.00000 1.000e+00
# 1st 2 PCs make up 84.8% cumulative proportion; 1st 3 make up 93.4%
# plot clusters using 1st 2 PCs
pcaBH_df <- data.frame(PC1 = pca_BH$x[,1],
PC2 = pca_BH$x[,2],
PC3 = pca_BH$x[,3],
BH_cluster = df$BH_cluster,
country = df$country)
BH_two_PCs <- ggplot(pcaBH_df, aes(x = PC1, y = PC2, color = BH_cluster)) +
geom_point() +
labs(title = "BH Clusters Against first 2 Principle Components",
x = "Principle Component 1",
y = "Principle Component 2",
color = "Clusters based on BH variables",
fill = "Clusters based on BH variables") +
theme_minimal()
BH_two_PCs
# plot clusters using 1st 3 PCs
BH_three_PCs <- plot_ly(data = pcaBH_df, x = ~PC1, y = ~PC2, z = ~PC3, color = ~factor(BH_cluster),
type = "scatter3d", mode = "markers", marker = list(size = 5)) %>%
layout(scene = list(xaxis = list(title = "Principle Component 1"),
yaxis = list(title = "Principle Component 2"),
zaxis = list(title = "Principle Component 3")),
legend = list(title=list(text='<b> Clusters based on BH variables')),
title = "3D Scatter Plot of PCA with BH Clusters")
BH_three_PCs
# CAT
catBH_anova <- aov(CAT ~ BH_cluster, data = df)
summary(catBH_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## BH_cluster 3 1.829 0.6096 25.03 3.57e-13 ***
## Residuals 149 3.628 0.0244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
catBH_posthoc <- TukeyHSD(catBH_anova)
catBH_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = CAT ~ BH_cluster, data = df)
##
## $BH_cluster
## diff lwr upr p adj
## 2-1 -0.288947221 -0.39012403 -0.18777041 0.0000000
## 3-1 -0.003397638 -0.08416531 0.07737004 0.9995308
## 4-1 -0.136455765 -0.24316251 -0.02974902 0.0061119
## 3-2 0.285549584 0.19103999 0.38005918 0.0000000
## 4-2 0.152491457 0.03503944 0.26994347 0.0051832
## 4-3 -0.133058127 -0.23346552 -0.03265073 0.0041181
# YOS
yosBH_anova <- aov(YOS ~ BH_cluster, data = df)
summary(yosBH_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## BH_cluster 3 1.283 0.4275 19.87 6.89e-11 ***
## Residuals 149 3.206 0.0215
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
yosBH_posthoc <- TukeyHSD(yosBH_anova)
yosBH_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = YOS ~ BH_cluster, data = df)
##
## $BH_cluster
## diff lwr upr p adj
## 2-1 -0.26828612 -0.3633943 -0.173177974 0.0000000
## 3-1 -0.04580024 -0.1217234 0.030122929 0.4003979
## 4-1 -0.10888827 -0.2091947 -0.008581886 0.0276226
## 3-2 0.22248588 0.1336450 0.311326720 0.0000000
## 4-2 0.15939785 0.0489907 0.269804996 0.0014226
## 4-3 -0.06308804 -0.1574729 0.031296847 0.3084581
# 1-COR
corBH_anova <- aov(1-COR ~ BH_cluster, data = df)
summary(corBH_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## BH_cluster 3 0.8314 0.2772 46.2 <2e-16 ***
## Residuals 149 0.8938 0.0060
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
corBH_posthoc <- TukeyHSD(corBH_anova)
corBH_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = 1 - COR ~ BH_cluster, data = df)
##
## $BH_cluster
## diff lwr upr p adj
## 2-1 0.157239020 0.10702133 0.20745670 0.0000000
## 3-1 0.159810949 0.11972305 0.19989885 0.0000000
## 4-1 0.031852488 -0.02110990 0.08481488 0.4031433
## 3-2 0.002571929 -0.04433658 0.04948044 0.9989644
## 4-2 -0.125386532 -0.18368218 -0.06709088 0.0000006
## 4-3 -0.127958461 -0.17779426 -0.07812266 0.0000000
# 1-BETA
betaBH_anova <- aov(1-BETA ~ BH_cluster, data = df)
summary(betaBH_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## BH_cluster 3 2.001 0.6670 34.97 <2e-16 ***
## Residuals 149 2.842 0.0191
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
betaBH_posthoc <- TukeyHSD(betaBH_anova)
betaBH_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = 1 - BETA ~ BH_cluster, data = df)
##
## $BH_cluster
## diff lwr upr p adj
## 2-1 -0.10741081 -0.19695290 -0.017868711 0.0116575
## 3-1 0.16345432 0.09197444 0.234934209 0.0000001
## 4-1 -0.09687715 -0.19131326 -0.002441033 0.0420049
## 3-2 0.27086513 0.18722356 0.354506698 0.0000000
## 4-2 0.01053366 -0.09341209 0.114479403 0.9935825
## 4-3 -0.26033147 -0.34919263 -0.171470315 0.0000000
# Gini
gini_BH_anova <- aov(gini ~ BH_cluster, data = df)
summary(gini_BH_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## BH_cluster 3 1089 362.9 7.215 0.000149 ***
## Residuals 149 7495 50.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gini_BH_posthoc <- TukeyHSD(gini_BH_anova)
gini_BH_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = gini ~ BH_cluster, data = df)
##
## $BH_cluster
## diff lwr upr p adj
## 2-1 -2.687509 -7.2859809 1.9109626 0.4290286
## 3-1 -6.214127 -9.8850064 -2.5432476 0.0001206
## 4-1 -1.299048 -6.1488539 3.5507586 0.8984982
## 3-2 -3.526618 -7.8220656 0.7688299 0.1474059
## 4-2 1.388462 -3.9497157 6.7266388 0.9061148
## 4-3 4.915079 0.3515775 9.4785813 0.0293654
# calculate mean and standard deviation for CAT scores within each cluster
catBH_summary <- df %>%
group_by(BH_cluster) %>%
summarize(mean_CAT = mean(CAT),
sd_CAT = sd(CAT),
n = n()) %>%
mutate(se = sd_CAT / sqrt(n),
ci_lower = mean_CAT - qt(0.975, n - 1) * se,
ci_upper = mean_CAT + qt(0.975, n - 1) * se)
catBH_summary
# create jittered strip plot with mean points and error bars
catBH_plot <- ggplot(df, aes(x = BH_cluster, y = CAT, color = BH_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "CAT", title = "CAT Scores by BH Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
catBH_plot
# calculate mean and standard deviation for YOS within each cluster
yosBH_summary <- df %>%
group_by(BH_cluster) %>%
summarize(mean_YOS = mean(YOS),
sd_YOS = sd(YOS))
yosBH_summary
# create jittered strip plot with mean lines
yosBH_plot <- ggplot(df, aes(x = BH_cluster, y = YOS, color = BH_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "YOS", title = "YOS Scores by BH Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
yosBH_plot
# calculate mean and standard deviation for 1-COR within each cluster
corBH_summary <- df %>%
group_by(BH_cluster) %>%
summarize(mean_1_COR = mean(1-COR),
sd_1_COR = sd(1-COR))
corBH_summary
# create jittered strip plot with mean lines
corBH_plot <- ggplot(df, aes(x = BH_cluster, y = 1-COR, color = BH_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "1-COR", title = "1-COR Scores by BH Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
corBH_plot
# calculate mean and standard deviation for 1-BETA within each cluster
betaBH_summary <- df %>%
group_by(BH_cluster) %>%
summarize(mean_1_BETA = mean(1-BETA),
sd_1_BETA = sd(1-BETA))
betaBH_summary
# create the strip plot with mean lines
betaBH_plot <- ggplot(df, aes(x = BH_cluster, y = 1-BETA, color = BH_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "1-BETA", title = "1-BETA Scores by BH Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
betaBH_plot
# calculate mean and standard deviation for gini index within each cluster
gini_BH_summary <- df %>%
group_by(BH_cluster) %>%
summarize(mean_gini = mean(gini),
sd_gini = sd(gini))
gini_BH_summary
# create the strip plot with mean lines
gini_BH_plot <- ggplot(df, aes(x = BH_cluster, y = gini, color = BH_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "Gini", title = "Gini Index by BH Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
gini_BH_plot
After running K-means clustering on the 25 BH variables, we perform 2 levels of validations:
Level 1: Run Principle Component Analysis (PCA) on the TM variables and plot the clusters against the first 2-3 PCs. We can check whether there’s clear margins between the clusters.
Level 2: Check descriptive qualities of each cluster through variables such as gini index and pre-derived IGM measures (CAT, YOS, COR, BETA).We run ANOVA to see if there’s statistically significant difference between each cluster.
# create plot of number of clusters vs total within sum of squares
fviz_nbclust(scale(df[63:87]), kmeans, method = "wss")
# k-means with k = 4
set.seed(234)
km.res.TM <- kmeans(scale(df[63:87]), 4, nstart = 25)
km.res.TM
## K-means clustering with 4 clusters of sizes 7, 46, 50, 50
##
## Cluster means:
## tm11 tm12 tm13 tm14 tm15 tm21
## 1 -0.2796238 -0.59317254 2.3811044 -0.96516041 -0.65992203 0.1117685
## 2 -0.4254435 0.16585453 0.1627500 0.02491049 0.76048609 -0.4476897
## 3 1.1320839 -0.15499211 -0.7024904 -0.72311271 -0.65324955 1.0611085
## 4 -0.7015285 0.08545009 0.2194058 0.83531752 0.04599143 -0.6648815
## tm22 tm23 tm24 tm25 tm31 tm32
## 1 -0.5568371 2.576303724 -1.36567514 -0.9837743 -0.08658153 -0.05765707
## 2 -0.1345903 0.002895838 0.02157411 0.9696476 -0.49049473 -0.28739247
## 3 0.5060023 -0.499091374 -0.78069007 -0.5089787 1.08513258 0.84120851
## 4 -0.3042220 0.135744681 0.95203641 -0.2453686 -0.62175602 -0.56873544
## tm33 tm34 tm35 tm41 tm42 tm43 tm44
## 1 3.1520051 -1.4170509 -0.9000162 -0.3491745 0.06535315 3.2453696 -1.2627014
## 2 -0.2187402 -0.2041437 1.1118133 -0.4581438 -0.29859942 -0.1784015 -0.4617618
## 3 -0.1174763 -0.7221351 -0.3553730 1.0349993 0.89980544 0.2124526 -0.4901917
## 4 -0.1225635 1.1083344 -0.5414929 -0.5646225 -0.63424341 -0.5026750 1.0917907
## tm45 tm51 tm52 tm53 tm54 tm55
## 1 -0.5932604 -0.4367202 -0.6303033 2.6704574 -0.9017447 -0.63070070
## 2 1.0320114 -0.4037488 -0.4645420 -0.3550004 -0.5763151 0.90772260
## 3 -0.5587157 0.9587524 0.9905800 0.4460539 -0.2455907 -0.78966772
## 4 -0.3076783 -0.5261628 -0.4749588 -0.4933176 0.9020448 0.04286102
##
## Clustering vector:
## [1] 3 2 2 4 4 2 4 4 2 4 2 3 3 2 4 1 2 4 3 3 3 2 3 2 3 3 2 3 2 3 3 1 3 3 4 2 4
## [38] 2 3 3 2 2 3 4 1 3 4 2 2 3 3 4 4 3 4 3 3 3 2 3 4 2 2 3 2 2 2 4 4 4 2 4 3 4
## [75] 2 4 4 3 4 2 2 3 4 4 3 2 2 3 3 2 2 4 4 4 3 3 2 1 3 4 2 3 3 3 4 2 3 2 3 4 4
## [112] 4 4 2 4 4 3 1 3 4 4 4 4 2 4 3 2 4 2 4 4 2 4 3 2 3 3 1 3 4 1 3 2 2 4 4 4 3
## [149] 4 2 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 109.4936 537.5893 827.8046 552.5461
## (between_SS / total_SS = 46.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# append cluster assignment to df
df <- cbind(df, TM_cluster = factor(km.res.TM$cluster))
# run PCA and take 2-3 1st PCs to visualize clusters
pca_TM <- prcomp(scale(df[63:87]))
summary(pca_TM)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.0515 2.0666 1.9912 1.5971 1.03791 0.78282 0.69863
## Proportion of Variance 0.3725 0.1708 0.1586 0.1020 0.04309 0.02451 0.01952
## Cumulative Proportion 0.3725 0.5433 0.7019 0.8039 0.84701 0.87152 0.89105
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.68239 0.62885 0.55618 0.53666 0.51012 0.45320 0.40843
## Proportion of Variance 0.01863 0.01582 0.01237 0.01152 0.01041 0.00822 0.00667
## Cumulative Proportion 0.90967 0.92549 0.93786 0.94938 0.95979 0.96801 0.97468
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.40519 0.37007 0.34944 0.30308 0.2644 0.21897 1.378e-08
## Proportion of Variance 0.00657 0.00548 0.00488 0.00367 0.0028 0.00192 0.000e+00
## Cumulative Proportion 0.98125 0.98673 0.99161 0.99529 0.9981 1.00000 1.000e+00
## PC22 PC23 PC24 PC25
## Standard deviation 1.159e-08 1.051e-08 8.882e-09 8.357e-09
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00
# 1st 3 PCs make up 70.2% cumulative proportion
# plot 4 TM clusters using 1st 3 PCs
pca_TM_df <- data.frame(PC1 = pca_TM$x[,1],
PC2 = pca_TM$x[,2],
PC3 = pca_TM$x[,3],
TM_cluster = df$TM_cluster,
country = df$country)
TM_three_PCs <- plot_ly(data = pca_TM_df, x = ~PC1, y = ~PC2, z = ~PC3, color = ~factor(TM_cluster),
type = "scatter3d", mode = "markers", marker = list(size = 5)) %>%
layout(scene = list(xaxis = list(title = "Principle Component 1"),
yaxis = list(title = "Principle Component 2"),
zaxis = list(title = "Principle Component 3")),
title = "3D Scatter Plot of 3PCs on TM Clusters")
TM_three_PCs
# CAT
cat_TM_anova <- aov(CAT ~ TM_cluster, data = df)
summary(cat_TM_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## TM_cluster 3 3.144 1.0480 67.51 <2e-16 ***
## Residuals 149 2.313 0.0155
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
catTM_posthoc <- TukeyHSD(cat_TM_anova)
catTM_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = CAT ~ TM_cluster, data = df)
##
## $TM_cluster
## diff lwr upr p adj
## 2-1 0.16197756 0.03064071 0.29331440 0.0088949
## 3-1 -0.18401231 -0.31465346 -0.05337117 0.0019659
## 4-1 0.07321327 -0.05742787 0.20385442 0.4666712
## 3-2 -0.34598987 -0.41212745 -0.27985229 0.0000000
## 4-2 -0.08876429 -0.15490186 -0.02262671 0.0035546
## 4-3 0.25722559 0.19248054 0.32197064 0.0000000
# YOS
yos_TM_anova <- aov(YOS ~ TM_cluster, data = df)
summary(yos_TM_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## TM_cluster 3 1.937 0.6457 37.71 <2e-16 ***
## Residuals 149 2.551 0.0171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
yosTM_posthoc <- TukeyHSD(yos_TM_anova)
yosTM_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = YOS ~ TM_cluster, data = df)
##
## $TM_cluster
## diff lwr upr p adj
## 2-1 0.11364584 -0.02429643 0.25158811 0.1451455
## 3-1 -0.16437960 -0.30159118 -0.02716802 0.0118019
## 4-1 0.01936328 -0.11784830 0.15657486 0.9830921
## 3-2 -0.27802544 -0.34748932 -0.20856156 0.0000000
## 4-2 -0.09428255 -0.16374644 -0.02481867 0.0031114
## 4-3 0.18374289 0.11574156 0.25174421 0.0000000
# 1-COR
cor_TM_anova <- aov(1-COR ~ TM_cluster, data = df)
summary(cor_TM_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## TM_cluster 3 0.0271 0.009028 0.792 0.5
## Residuals 149 1.6982 0.011397
corTM_posthoc <- TukeyHSD(cor_TM_anova)
corTM_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = 1 - COR ~ TM_cluster, data = df)
##
## $TM_cluster
## diff lwr upr p adj
## 2-1 -0.0014503713 -0.11398628 0.11108554 0.9999865
## 3-1 0.0271851557 -0.08475464 0.13912495 0.9219128
## 4-1 -0.0010821991 -0.11302200 0.11085760 0.9999943
## 3-2 0.0286355270 -0.02803442 0.08530548 0.5561888
## 4-2 0.0003681721 -0.05630178 0.05703812 0.9999983
## 4-3 -0.0282673549 -0.08374412 0.02720941 0.5492205
# 1-BETA
beta_TM_anova <- aov(1-BETA ~ TM_cluster, data = df)
summary(beta_TM_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## TM_cluster 3 0.932 0.31065 11.84 5.36e-07 ***
## Residuals 149 3.911 0.02625
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
betaTM_posthoc <- TukeyHSD(beta_TM_anova)
betaTM_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = 1 - BETA ~ TM_cluster, data = df)
##
## $TM_cluster
## diff lwr upr p adj
## 2-1 -0.11122661 -0.28200302 0.05954979 0.3313954
## 3-1 -0.22554284 -0.39541463 -0.05567105 0.0040286
## 4-1 -0.04631929 -0.21619109 0.12355250 0.8936102
## 3-2 -0.11431623 -0.20031447 -0.02831799 0.0039744
## 4-2 0.06490732 -0.02109092 0.15090556 0.2075414
## 4-3 0.17922355 0.09503600 0.26341110 0.0000008
# Gini
gini_TM_anova <- aov(gini ~ TM_cluster, data = df)
summary(gini_TM_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## TM_cluster 3 1454 484.8 10.13 4.09e-06 ***
## Residuals 149 7129 47.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
giniTM_posthoc <- TukeyHSD(gini_TM_anova)
giniTM_posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = gini ~ TM_cluster, data = df)
##
## $TM_cluster
## diff lwr upr p adj
## 2-1 -9.243727 -16.5353003 -1.952153 0.0067033
## 3-1 -4.772857 -12.0258069 2.480093 0.3222284
## 4-1 -10.884857 -18.1378069 -3.631907 0.0008309
## 3-2 4.470870 0.7990362 8.142703 0.0100952
## 4-2 -1.641130 -5.3129638 2.030703 0.6521365
## 4-3 -6.112000 -9.7065229 -2.517477 0.0001114
# calculate mean and standard deviation for CAT scores within each cluster
catTM_summary <- df %>%
group_by(TM_cluster) %>%
summarize(mean_CAT = mean(CAT),
sd_CAT = sd(CAT),
n = n()) %>%
mutate(se = sd_CAT / sqrt(n),
ci_lower = mean_CAT - qt(0.975, n - 1) * se,
ci_upper = mean_CAT + qt(0.975, n - 1) * se)
catTM_summary
# create jittered strip plot with mean points and error bars
catTM_plot <- ggplot(df, aes(x = factor(TM_cluster), y = CAT, color = TM_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "CAT", title = "TM CAT Scores by Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
catTM_plot
# calculate mean and standard deviation for YOS within each cluster
yosTM_summary <- df %>%
group_by(TM_cluster) %>%
summarize(mean_YOS = mean(YOS),
sd_YOS = sd(YOS))
yosTM_summary
# create jittered strip plot with mean lines
yos_plot <- ggplot(df, aes(x = TM_cluster, y = YOS, color = TM_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "YOS", title = "YOS Scores by Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
yos_plot
# calculate mean and standard deviation for 1-COR within each cluster
corTM_summary <- df %>%
group_by(TM_cluster) %>%
summarize(mean_1_COR = mean(1-COR),
sd_1_COR = sd(1-COR))
corTM_summary
# create jittered strip plot with mean lines
corTM_plot <- ggplot(df, aes(x = TM_cluster, y = 1-COR, color = TM_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "1-COR", title = "1-COR Scores by Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
corTM_plot
# calculate mean and standard deviation for 1-BETA within each cluster
betaTM_summary <- df %>%
group_by(TM_cluster) %>%
summarize(mean_1_BETA = mean(1-BETA),
sd_1_BETA = sd(1-BETA))
betaTM_summary
# create jittered strip plot with mean lines
betaTM_plot <- ggplot(df, aes(x = TM_cluster, y = 1-BETA, color = TM_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "1-BETA", title = "1-BETA Scores by Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
betaTM_plot
# calculate mean and standard deviation for gini index within each cluster
giniTM_summary <- df %>%
group_by(TM_cluster) %>%
summarize(mean_gini = mean(gini),
sd_gini = sd(gini))
giniTM_summary
# create the strip plot with mean lines
giniTM_plot <- ggplot(df, aes(x = TM_cluster, y = gini, color = TM_cluster)) +
geom_jitter(width = 0.2, alpha = 0.7) +
stat_summary(fun = mean, geom = "crossbar", position = position_dodge(width = 0.2), color = "black", fill = "black", width = 0.5) +
labs(x = "Cluster", y = "Gini", title = "Gini Index by TM Cluster") +
scale_color_manual(values = c("red", "green", "blue", "orange")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
giniTM_plot
We’re creating a matrix to see how BH clusters and TM clusters overlap with each other.
# we're using a confusion matrix as it uses the sample principle
confusionMatrix(df$BH_cluster, df$TM_cluster)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 4 15 3 20
## 2 0 3 23 0
## 3 3 20 12 28
## 4 0 8 12 2
##
## Overall Statistics
##
## Accuracy : 0.1373
## 95% CI : (0.087, 0.2021)
## No Information Rate : 0.3268
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.143
##
## Mcnemar's Test P-Value : 5.63e-09
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.57143 0.06522 0.24000 0.04000
## Specificity 0.73973 0.78505 0.50485 0.80583
## Pos Pred Value 0.09524 0.11538 0.19048 0.09091
## Neg Pred Value 0.97297 0.66142 0.57778 0.63359
## Prevalence 0.04575 0.30065 0.32680 0.32680
## Detection Rate 0.02614 0.01961 0.07843 0.01307
## Detection Prevalence 0.27451 0.16993 0.41176 0.14379
## Balanced Accuracy 0.65558 0.42513 0.37243 0.42291